library(plotly)
library(tidyverse)
library(scales)
library(patchwork)
library(colorspace)
options(scipen=999)
load("C:/Users/Andres/korpused/publicvalue/cinando/drafts.RData")
# VZ_Films containing full film information with a separate row for each movie,
# VZ_FilmLanguage for languages spoken in films where each film may be listed multiple times if there are multiple languages spoken,
# VZ_FilmOrigin with the same structure as VZ_FilmLanguage only for production origins of films, and
# VZ_FilmGenre with the same structure only for genres.
# Perhaps you could take a look at those to see if we could show them a bit of a summary on what films are listed in the database?
# look at Languages, Origins and Genres from the separate tables + you could also plot the following from the main film table: Duration, Budget, Seller, and then
# perhaps some quantitative stats such as the ones starting with Adm (for admissions), Bo (for box office), NBOfScreens (for the number of screens).
# It may also be interesting for you to look at SynopsisVA as it contains text.
# VZ_FilmCrewOnFilms query. This one connects film crews with the films they worked on together with their roles. From this query we could build a social network to show which people work on which films. However, the crew data is very messy. You will see that the same person is often entered onto the database multiple times with new RefFilmCrew IDs
# festival networks, but maybe try producers-festival networks, or cluster producers by festival. something with temporal networks too?
csvs = list.files("C:/Users/Andres/korpused/cinando/CSV exports", full.names = T)
dat = sapply(csvs, read.csv, header = 1, na.strings = "" )
names(dat) = basename(csvs) %>% gsub("^VZ_|.csv$","",.)
sapply(dat, nrow) %>% sort()
save(dat, file="C:/Users/Andres/korpused/cinando/drafts.RData")
Note: not all films have all the metadata, many don’t even have a language entry (but hard to tell how much, as the maste Films table seems to have duplicates). Some films may be multilingual, other multi-entry ones are likely dubbed versions of the same film; so far no way to tell original language (besides comparing to country of origin, which is not precise). Most films list 1-2 languages.
# language one has multiple entries for same id and same lang
d0=dat$FilmLanguage %>% distinct()
d = d0 %>% count(refFilm)
g1 = ggplot(d, aes(x=n))+geom_bar(color="black") +labs(x="Lng per film", y="Count (note log axis)")+
scale_y_log10(expand=c(0,0.05), breaks = trans_breaks("log10", function(x) 10^x) #,
# sec.axis = sec_axis(~.,breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x) ))
)+
scale_x_continuous(limits=c(0,max(d$n)),expand=c(0,0.1), breaks=c(1,5, seq(10,30,10)), labels=c(1,5, seq(10,30,10))) +
#scale_fill_gradient( high = "#BFBCFF", low="black" )+ , fill = log10(..count..)
annotation_logticks(sides="lr", size=0.2,
short = unit(.5,"mm"),
mid = unit(1,"mm"),
long = unit(2,"mm")) +
theme_bw()
#dat$FilmLanguage %>% filter(refFilm=="287598")%>% pull(txtLanguage) %>% table %>% sort()
d2 = d0 %>% mutate(txtLanguage = tolower(txtLanguage)) %>% count(txtLanguage) %>% arrange(desc(n))
d2$gr = c(rep(1,4), rep(5:10, each=45))[1:nrow(d2)]
g2 = ggplot(d2, aes(x=n,y = reorder(txtLanguage,n)))+
geom_bar(stat="identity", color="black")+
facet_wrap(~gr, nrow = 1, scales = "free")+
labs(y="", x="Number of entries of languages in the database (note the different scales)")+
theme_bw()+
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
axis.text.x = element_text(angle=45, hjust=1, vjust=1),
axis.text = element_text(size=7)
)
g1+g2+plot_layout(widths = c(0.1,0.9))
# dat$FilmGenre %>% filter(refFilm =="209517")
d0 = dat$FilmGenre %>% distinct()
d = d0 %>% count(refFilm)
g1 = ggplot(d, aes(x=n))+geom_bar(color="black") +labs(x="Genres per film", y="Count")+
scale_x_continuous(limits=c(0,max(d$n)),expand=c(0,0.1), breaks=c(1:max(d$n))) +
theme_bw()
d2 = d0 %>% count(txtKind) %>% arrange(desc(n))
d2$gr = c(rep(1,5), rep(2:3, each=18))[1:nrow(d2)]
g2 = ggplot(d2, aes(x=n,y = reorder(txtKind,n)))+
geom_bar(stat="identity", color="black")+
facet_wrap(~gr, nrow = 1, scales = "free")+
labs(y="", x="Number of entries of genres in the database (note the different scales)")+
theme_bw()+
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
axis.text.x = element_text(angle=45, hjust=1, vjust=1),
axis.text = element_text(size=7)
)
g1+g2+plot_layout(widths = c(0.1,0.9))
Most have 1, some have 2-3, but there are a small number with 10+.
# dat$FilmGenre %>% filter(refFilm =="209517")
d0 = dat$FilmOrigin %>% mutate(txtCountry=tolower(txtCountry)) %>% distinct()
d = d0 %>% count(refFilm)
g1 = ggplot(d, aes(x=n))+geom_bar(color="black") +labs(x="Countries per film", y="Count")+
#scale_x_continuous(limits=c(0,max(d$n)),expand=c(0.1,2), breaks=c(1:5, 10, max(d$n)+1)) +
theme_bw()
d2 = d0 %>% count(txtCountry) %>% arrange(desc(n))
d2$gr = c(rep(1,4), rep(2:3, each=50))[1:nrow(d2)]
g2 = ggplot(d2, aes(x=n,y = reorder(txtCountry,n)))+
geom_bar(stat="identity", color="black")+
facet_wrap(~gr, nrow = 1, scales = "free")+
labs(y="", x="Number of entries of genres in the database (note the different scales)")+
theme_bw()+
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
axis.text.x = element_text(angle=45, hjust=1, vjust=1),
axis.text = element_text(size=7)
)
g1+g2+plot_layout(widths = c(0.1,0.9))
This and the next sections are based on the main/biggest database table Films; it has duplicate entries we do not yet fully understand, as the same movie is often listed under multiple IDs. However, entries with duration value (n=141472) contain no duplicates in terms of ID (do contain Title_Year duplicates; 85460 left when those removed).
# you could also plot the following from the main film table: Duration, Budget, Seller, and then
# perhaps some quantitative stats such as the ones starting with Adm (for admissions), Bo (for box office), NBOfScreens (for the number of screens).
d = dat$Films %>% filter(!is.na(Duration)) %>% mutate(Duration=as.numeric(Duration), YearOfProduction=as.numeric(YearOfProduction)) %>% filter(!is.na(Duration)) %>%
filter(!duplicated(paste(TitleVA, YearOfProduction)) | !duplicated(paste(TitleVF, YearOfProduction))) %>%
filter(Duration<300) %>%
arrange(desc(YearOfProduction))
ggplot(d, aes(x=as.numeric(YearOfProduction)))+
geom_bar( color="black")+
labs(x="Year of production", y="Counts (note the log axis)")+
scale_x_continuous(breaks=c(seq(1900, 1990, 10), seq(2000, 2021, 3) ), sec.axis=sec_axis(~.,breaks=c(seq(1900, 1990, 10), seq(2000, 2021, 3) )))+
scale_y_log10(expand=c(0,0.05), breaks = trans_breaks("log10", function(x) 10^x))+
annotation_logticks(sides="lr", size=0.2,
short = unit(.5,"mm"),
mid = unit(1,"mm"),
long = unit(2,"mm")) +
theme_bw()+
theme(
axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
axis.text = element_text(size=7)
)
# setdiff(dat$Films$cFilm, d$cFilm) %>% length()
ggplot(d, aes(x=Duration))+
geom_boxplot(aes(x=Duration, y=-200),data=d, outlier.shape = NA, inherit.aes = F, width=200)+
geom_bar( color="black")+
#geom_density(color="black", fill="gray")
labs(x="Length in minutes (excluding a few outliers)", y="Count")+
scale_x_continuous(breaks=seq(0,300, 60))+
#geom_point(aes(y=0), data=d %>% filter(!duplicated(Duration)), shape=124, size=0.6)+
theme_bw()+
NULL
d2 = d %>% filter(YearOfProduction > 1950, YearOfProduction < 2022)
ggplot(d2, aes(y=Duration, x=YearOfProduction))+
geom_point(size=0.4, alpha=0.1, position=position_jitter(width = 0.5, height=5))+
#geom_density(color="black", fill="gray")
labs(y="Length in minutes", x="Length of films made after 1950")+
scale_y_continuous(breaks=seq(0,300, 60))+
scale_x_continuous(breaks=c(seq(1950, 2020, 10) ))+
#geom_point(aes(y=0), data=d %>% filter(!duplicated(Duration)), shape=124, size=0.6)+
theme_bw()+
NULL
Same as above; films with budget listed (n=29828) contain no duplicates. Units unknown (thousands, hundreds of thousands? Dollars?). The median Budget value is 1, and 9493 have it listed as 0 (the highest bar).
d = dat$Films %>% filter(!is.na(Budget)) %>% mutate(Budget=as.numeric(Budget)) %>% filter(!is.na(Budget)) %>% filter(Budget<150)
ggplot(d, aes(x=Budget))+
#geom_boxplot(aes(x=Budget, y=-200),data=d, outlier.shape = NA, inherit.aes = F, width=200)+
geom_bar( color="black") +
#geom_density(color="black", fill="gray")
labs(x="Budget (excluding a few outliers at ~200 and ~2000)", y="Count (note the log axis)")+
scale_y_log10(expand=c(0,0.05), breaks = trans_breaks("log10", function(x) 10^x))+
annotation_logticks(sides="lr", size=0.2,
short = unit(.5,"mm"),
mid = unit(1,"mm"),
long = unit(2,"mm")) +
#geom_point(aes(y=0), data=d %>% filter(!duplicated(Duration)), shape=124, size=0.6)+
theme_bw()+
NULL
Admissions info is present for only a few thousand films (first week admission: 5903, total: 1234, both: 963), but majority of values are actually just 0s.
Non-zero box office totals are available for 1263 films. Units unknown.
d = dat$Films %>% mutate(BoTotal = as.numeric(BoTotal)) %>% filter(!is.na(BoTotal) & BoTotal>0) %>% arrange(BoTotal)
ggplot(d, aes(y=BoTotal, x=1:nrow(d)))+
#geom_boxplot(aes(x=Budget, y=-200),data=d, outlier.shape = NA, inherit.aes = F, width=200)+
#geom_bar(stat="identity", color="black") +
geom_point(size=0.7)+
#geom_density(color="black", fill="gray")
labs(x="Ordered by box office total", y="Box office total (note the log scale)")+
scale_y_log10(expand=c(0,0.05), breaks = trans_breaks("log10", function(x) 10^x))+
annotation_logticks(sides="lr", size=0.2,
short = unit(.5,"mm"),
mid = unit(1,"mm"),
long = unit(2,"mm")) +
#geom_point(aes(y=0), data=d %>% filter(!duplicated(Duration)), shape=124, size=0.6)+
theme_bw()+
NULL
Non-zero screening numbers are available for 932 films.
d = dat$Films %>% mutate(NbOfScreens = as.numeric(NbOfScreens)) %>% filter(!is.na(NbOfScreens) & NbOfScreens>0) %>% arrange(NbOfScreens)
ggplot(d, aes(y=NbOfScreens, x=1:nrow(d)))+
#geom_boxplot(aes(x=Budget, y=-200),data=d, outlier.shape = NA, inherit.aes = F, width=200)+
#geom_bar(stat="identity", color="black") +
geom_point(size=0.7)+
#geom_density(color="black", fill="gray")
labs(x="Ordered by screenings", y="Screenings (note the log scale)")+
scale_y_log10(expand=c(0,0.05), breaks = trans_breaks("log10", function(x) 10^x))+
annotation_logticks(sides="lr", size=0.2,
short = unit(.5,"mm"),
mid = unit(1,"mm"),
long = unit(2,"mm")) +
#geom_point(aes(y=0), data=d %>% filter(!duplicated(Duration)), shape=124, size=0.6)+
theme_bw()+
NULL
Another section in the database, on Market Participation lists more data on individual films, going back to 2005.
d = dat$ScreeningsAtMarkets %>% group_by(MarketName, Year) %>% count()
ggplot(d, aes(x=Year, y=MarketName, group=MarketName, color=n, size=n))+
#geom_boxplot(aes(x=Budget, y=-200),data=d, outlier.shape = NA, inherit.aes = F, width=200)+
#geom_bar(stat="identity", color="black") +
geom_line(color="black", size=1)+
guides(size=F)+
geom_point()+
scale_colour_viridis_b(name="Number\nof\nscreenings", breaks=seq(0,1500,250))+
theme_bw()
library(rworldmap)
data("countryExData", envir = environment(), package = "rworldmap")
m = joinCountryData2Map(countryExData, joinCode = "ISO3", nameJoinColumn = "ISO3V10", mapResolution = "low") %>% fortify(mymap)
## 149 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 95 codes from the map weren't represented in your data
lats = dat$ScreeningsAtMarkets %>% filter(Year==2019) %>% group_by(MarketName) %>% summarise(latitude=mean(latitude, na.rm=T), longitude=mean(longitude, na.rm=T), n=n())
n1 = "gray90"; n2=viridis_pal(option="D")(9)[8]
barwidth = 2
barheight = 0.045
ggplot(lats, aes(x=longitude,y=latitude,label=MarketName))+
geom_polygon(data=m, aes(long, lat, group = group), inherit.aes = F)+
# geom_rect(data = lats,
# aes(xmin = longitude - barwidth,
# xmax = longitude + barwidth,
# ymin = latitude,
# ymax = latitude + max(n)*barheight), color=n1, fill=n1)+
geom_rect(data = lats,
aes(xmin = longitude - barwidth,
xmax = longitude + barwidth,
ymin = latitude,
ymax = latitude + n*barheight), color=n2, fill=n2) +
geom_text(angle=90, hjust=0,vjust=0.5, size=3 ) +
coord_cartesian( xlim=c(-150,150), ylim=c(-30, 100))+
labs(x="Number of screenings at 2019 festivals across the world", y="")+
theme_bw()+
theme( axis.text=element_blank(), axis.ticks =element_blank())
This could be also plotted as an actual network, where each node is a festival at a given year and edges mark films shared by festivals, but it just yields a massive hairball, so here’s a matrix instead, ordered by year (hover for labels).
d = dat$MarketParticipation %>% filter(!is.na(Year), Year < 2021) %>%
mutate(Name2 = paste(Year, Name, sep="_")) %>%
filter(!(Name2 %in% c("2009_Ventana Sur", "2010_RDV Unifrance") )) %>% group_by(refFilm, Name2) %>% distinct(refFilm, Name2, .keep_all=T)
d2 = crossprod(table(d %>% select(refFilm, Name2))) %>% as.matrix()
diag(d2) = NA
d3 = d2 %>% as.matrix() %>% reshape2::melt() %>% {colnames(.)=c("Var1", "Var2", "value");.}
d3 = d3 %>% mutate(festivals = paste("", Var1, Var2, value, sep="\n"))
g = ggplot(d3, aes(Var1, Var2, fill=value, label=festivals))+geom_tile() + scale_fill_viridis_c(trans="log10", breaks=10^(1:4), na.value = "white", name="Number\nof shared\nfilms") + theme_bw() +
theme(axis.title = element_blank(),
axis.text = element_text(size=3),
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))
ggplotly(g, tooltip="festivals")
#
# g = graph_from_adjacency_matrix(d2, mode="upper", weighted=T, diag=F)
# g2 = as_tbl_graph(x = g, directed = F)
#
#
# ggraph(g2, layout="linear", circular=T) +
# geom_edge_arc(aes(width=log10(weight)), color="black", strength = 0.1, alpha=0.2) +
# scale_edge_width(range=c(0.1,4))+
# geom_node_point(aes(size = local_size()-1), color="black", alpha=0.5) + # stats using tidygraph
# scale_size(range=c(1,3), name="n links") +
# geom_node_text(aes(label=name), hjust=0, nudge_x = 0.07, size=3 ) +
# theme_graph(base_family=NA, background = "white")+
# theme(legend.position = "none")
#
# # visEdges(arrows = "to", shadow=T, smooth=list(type="discrete"), selectionWidth=5)
# # visOptions(highlightNearest = list(enabled = T, hover = T, degree=1, labelOnly=F, algorithm="hierarchical"), nodesIdSelection = T) # interactive selection options
The Film Crews part of the database has 1068821 entries; including 965556 unique full entries, encompassing 213078 films and 286725 people (i.e. not accounting for possible namesakes; after excluding “NA” names and roles). Since that’s a bit much to put on a single plot, I’ll pick the 10 top most prolific people and plot the people in the movie crews they’ve been involved in. This yields a network of 2484 people (nodes are people, edges indicate they’ve cooperated on a movie). For people who appear in multiple roles, I’m coloring them by their most frequent role.
library(igraph)
library(ggraph)
library(tidygraph)
d = dat$FilmCrewOnFilms %>% mutate(name = tolower(paste(FirstName, LastName))) %>% select(name, refFilm, txtType) %>% filter(!(name %in% c("- -", "na na")), !is.na(txtType))
ggplot(d %>% count(txtType), aes(y=reorder(txtType, n), x=n ))+
#geom_boxplot(aes(x=Budget, y=-200),data=d, outlier.shape = NA, inherit.aes = F, width=200)+
geom_bar(stat="identity", color="black") +
#geom_density(color="black", fill="gray")
labs(x="Count", y="")+
theme_bw()
types = d %>% group_by(name) %>% count(txtType) %>% ungroup() %>% arrange(desc(n)) %>% group_by(name) %>% slice(1) %>% arrange(desc(n)) # matching with this assigns their most common role to person
d$type = types$txtType[match(d$name, types$name)]
d2 = d %>% select(-txtType) %>% dplyr::distinct()
d2$type[!(d2$type %in% c("Cast", "Director", "Producer", "Writer"))] = "other"
pop = table(d2$name) %>% sort() %>% tail(9) %>% names()
d3 = d2 %>% filter(refFilm %in% {d2 %>% filter(name %in% pop) %>% pull(refFilm)} )
dc = crossprod(table(d3 %>% select(refFilm,name))) %>% as.matrix()
diag(dc) = 0
g = graph_from_adjacency_matrix(dc, mode="upper", weighted=T, diag=F)
g2 = as_tbl_graph(x = g, directed = F) %>% activate(nodes) %>% mutate(role = d2$type[match(name, d2$name)] )
ggraph(g2, layout="fr") +
geom_edge_arc(width=0.1, color="gray40", strength = 0.1, alpha=0.1) +
scale_edge_width(range=c(0.1,4), guide=F)+
geom_node_point(aes( color=role), alpha=0.5,shape=16, size=0.7) + # stats using tidygraph
scale_size(range=c(1,3), name="n links", guide=F) +
geom_node_text( aes(label=name, color=role),data= function(g2){ g2 %>% filter(name %in% pop) %>% as.data.frame() %>% mutate(name=paste("<", name))}, hjust=0, size=5, repel=F) +
theme_graph(base_family=NA, background = "white")
#Could be interesting to match the language table with the origin table to see which countries make films in which languages
# Just added another table VZ_MarketsAtFestivals - this one lists all festival that we track plus any markets that happen at the festival (we track fewer markets than festivals). Using this, we could make a map of countries where most festivals happen or something (we do not have lat longs for the festivals as they tend to happen at different venues)
# problem: some names are possibly firstname-lastname confused e.g. kim ki-duk / ki-duk kim
library(Matrix)
library(quanteda)
d3 = d2 %>% filter(type %in% c("Director", "Producer")) %>% group_by(refFilm) %>% filter(n() > 1) %>% group_by(name) %>% filter(n() > 1) %>% ungroup() # 42055 unique
length(unique(d3$name))
dl = d3 %>% group_by(refFilm) %>% group_split() %>% lapply(function(x) pull(x, name))
dlm = tokens(dl) %>% dfm() %>% Matrix(sparse=T) %>% crossprod()
diag(dlm) = 0
dim(dlm)
g = graph_from_adjacency_matrix(dlm, mode="upper", weighted=T, diag=F)
g2 = as_tbl_graph(x = g, directed = F) %>% activate(nodes) %>% mutate(role = d3$type[match(name, d3$name)] )
g = ggraph(g2, layout="fr") +
geom_edge_arc(aes(alpha = weight, width=weight), color="gray40", strength = 0.1) +
geom_node_point(aes( color=role, size = centrality_pagerank()), alpha=0.4,shape=16) +
scale_edge_width(range=c(0.05,1), guide=F)+
scale_edge_alpha(range=c(0.1,0.3), guide=F)+
scale_size(range=c(0.3,2), name="n links", guide=F) +
theme_graph(base_family=NA, background = "white")+
theme(legend.position = "none")
#geom_node_text( aes(label=name, color=role),data= function(g2){ g2 %>% filter(name %in% pop) %>% as.data.frame() %>% mutate(name=paste("<", name))}, hjust=0, size=5, repel=F) +
ggsave("C:/Users/Andres/Dropbox/cudan/publicvalue/dir_prod_graph.png", g, device = "png", width = 15, height=15)
g = ggraph(g2, layout="fr") +
geom_edge_arc(aes(alpha = weight, width=weight), color="gray40", strength = 0.1) +
geom_node_point(aes( color=role, size = centrality_pagerank()), alpha=0.2,shape=16) +
scale_edge_width(range=c(0.05,1), guide=F)+
scale_edge_alpha(range=c(0.1,0.2), guide=F)+
scale_size(range=c(0.1,1), name="n links", guide=F) +
theme_graph(base_family=NA, background = "white")+
theme(legend.position = "none")
#geom_node_text( aes(label=name, color=role),data= function(g2){ g2 %>% filter(name %in% pop) %>% as.data.frame() %>% mutate(name=paste("<", name))}, hjust=0, size=5, repel=F) +
ggsave("C:/Users/Andres/Dropbox/cudan/publicvalue/dir_prod_graph2.png", g, device = "png", width = 15, height=15)
sleepnow()
But to have some sort of overview, here is a big plot of all 60min+ movies with a 500+ character synopsis and non-duplicate titles (n=26725), arranged based on a simple topic model of their synopsis texts (movies with similar stories are closer; dimension-reduced to 2D using UMAP).
library(quanteda)
library(quanteda.textmodels)
library(umap)
fg = dat$FilmGenre %>% group_by(txtKind) %>% mutate(n = n()) %>% arrange(desc(n))
d = dat$Films %>% filter(!is.na(Duration)) %>% mutate(Duration=as.numeric(Duration)) %>% filter(!is.na(Duration)) %>%
mutate(title2 = case_when(!is.na(TitleVF) ~ TitleVF, T~TitleVA)) %>%
filter(!duplicated(title2), !duplicated(cFilm)) %>% filter(cFilm %in% dat$FilmGenre$refFilm, Duration<300, Duration > 60, !is.na(SynopsisVF) | !is.na(SynopsisVA)) %>% mutate(genre =fg$txtKind[match(cFilm, fg$refFilm)]) %>%
mutate(text = paste(SynopsisVF, SynopsisVA, TitleVA, genre)) %>% filter(nchar(text)>500, nchar(text) < 3000, !is.na(genre))
d = mutate(d, country = dat$FilmOrigin$txtCountry[match(cFilm, dat$FilmOrigin$refFilm)])
df = tokens(d$text, remove_punct = T, remove_symbols = T, remove_numbers = T, remove_url = T, remove_separators = T) %>% dfm(tolower = T, stem = T, remove = c("NA", stopwords()) )
df = dfm_trim(df, min_termfreq = 100, max_docfreq = 0.1, docfreq_type="prop")
df = df %>% dfm_tfidf()
lsa = textmodel_lsa(df, 20)
# lsa$docs %>% class
u = umap(lsa$docs %>% as.matrix() )
u0 = u$layout
u = cbind(u0, d %>% select(TitleVA, genre, country, YearOfProduction))
u0 %>% plot(cex=0.1, ylim=c(-6,5), xlim=c(-17, 8))
u = u[u[,1] > -17 & u[,1] < 8 & u[,2] > -6 & u[,2] < 5, ]
colnames(u)[1:2] = c("v1", "v2")
u$film = paste("", u$TitleVA, u$genre, u$country, u$YearOfProduction, sep="\n")
u = mutate(u, genre2= case_when((genre %in% {table(u$genre) %>% sort %>% tail(5) %>% names()}) ~ as.character(genre), T~as.character(NA)) )
g = ggplot(u, aes(v1,v2, label=film, color=genre2))+
geom_point(size=0.2, shape=16)+
scale_colour_brewer(palette = "Set1", na.value="gray", breaks={table(u$genre) %>% sort %>% tail(5) %>% names()}, name="" )+
theme_bw()+
theme(axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_blank()
)
p = ggplotly(g, tooltip="film") %>% toWebGL()
save(p, file="C:/Users/Andres/korpused/cinando/u.RData")
load("C:/Users/Andres/korpused/publicvalue/cinando/u.RData")
p
This has been just a quick overview, there’s plenty more info on crews, markets, movie metadata, synopsis texts etc to look into.
fg = dat$FilmGenre %>% group_by(txtKind) %>% mutate(n = n()) %>% arrange(desc(n))
d = dat$Films %>% mutate(Duration=as.numeric(Duration)) %>% mutate(genre =fg$txtKind[match(cFilm, fg$refFilm)])
d = mutate(d, country = dat$FilmOrigin$txtCountry[match(cFilm, dat$FilmOrigin$refFilm)])
d %>% filter(nchar(YearOfProduction)>3 & !is.na(YearOfProduction)) %>% mutate(YearOfProduction=as.numeric(YearOfProduction)) %>% mutate(id=paste(TitleVA, YearOfProduction)) %>%
arrange(YearOfProduction) %>%
filter(!duplicated(id), !duplicated(cFilm)) %>%
select(TitleVA, TitleVF, country, genre, YearOfProduction, SynopsisVA) %>% head(20) %>% as_tibble()
x = fg %>% group_by(refFilm) %>% group_split() %>% lapply(., function(x) pull(x, txtKind) )
x = tokens(x) %>% fcm(context="document", tri = F) %>% as.matrix()
diag(x) = 0
f = table(fg$txtKind)
x2 = x[match(names(sort(f, decreasing = T)), rownames(x)), match(names(sort(f, decreasing = T)), rownames(x))]
x2[1:5,1:5]
x2 = reshape2::melt(x2)
colnames(x2)[1:2] = c("v1", "v2")
ggplot(x2, aes(v1,v2, fill=value))+geom_tile() +
scale_fill_viridis_c(trans="log10", breaks=10^(1:4), na.value = "white", name="Number\nof shared\ngenre\ntags") +
theme_bw() +
theme(axis.title = element_blank(),
axis.text = element_text(size=8),
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))
fg$year = dat$Films$YearOfProduction[match(fg$refFilm, dat$Films$cFilm)] %>% as.numeric()
y = 1976:2020
g = unique(fg$txtKind)
fgr = tibble()
for(i in y){
x = data.frame(year=i, txtKind = g,
n = fg %>% filter(year == i) %>% .[match(g, .$txtKind), "n"]
)
x = x %>% mutate(rank= order(n, decreasing=T)) %>% mutate(rank=case_when(!is.na(n) ~ rank, T~length(g) ))
fgr = rbind(fgr, x)
}
ggplot(fgr, aes(y=rank, x=year, group=txtKind, color=txtKind))+
geom_line(alpha=0.5, size=1.3)+geom_point(size=0.7, alpha=0.5)+
annotate("rect", ymax=41, xmax=1979.8, ymin=0.6, xmin=1975, fill="white", color="white", alpha=0.9)+
geom_text(aes(label=txtKind), data=fgr %>% filter(year==2020), hjust=-0.1, size=3)+
geom_text(aes(label=txtKind), data=fgr %>% filter(year==1980), hjust=1.2, size=3)+
scale_x_continuous(breaks=seq(1980,2020,1))+
scale_y_reverse(lim=c(length(g), 0.5), breaks=c(1,10,20,30,40))+
scale_color_manual(values=scales::hue_pal(h = c(0,360), l = 60)(41) %>% sample())+
coord_cartesian(xlim=c(1978, 2022), ylim=c(41,1))+
theme_bw()+theme(legend.position = "none", axis.text.x = element_text(size=8, angle=90, hjust=0, vjust=0.5))
f = fg %>% filter(year>1930, year <2022) %>% group_by(txtKind, year) %>% count() %>% group_by(year) %>% mutate(n=n/sum(n, na.rm=T)) #%>% # mutate(n = case_when(n==0 ~ as.numeric(NA), T~n))
ggplot(f, aes(y=n, x=year, group=txtKind, color=txtKind))+geom_line()+geom_point(size=0.1)+
scale_x_continuous(breaks=seq(1950,2020,20))+
facet_wrap(~txtKind, scales="free_y")+labs(x="", y="% of tags in given year (note different scales)")+
theme_bw()+theme(legend.position = "none", axis.text = element_text(size=7))
ggplot(f %>% filter(txtKind=="Female director") , aes(y=n, x=year, group=txtKind))+geom_line()+geom_point(size=0.8)+
scale_x_continuous(breaks=seq(1987,2021,2))+
facet_wrap(~txtKind, scales="free_y")+
labs(x="", y="% of tags in given year")+
theme_bw()+theme(legend.position = "none", axis.text.x = element_text(size=8, angle=90, hjust=0, vjust=0.5))
cl=scales::hue_pal(h = c(0,360), l = 60)(41-5) %>% sample()
fgr2 = fgr %>% filter(!(txtKind %in% c("Drama", "Comedy", "Documentary", "Thriller", "Action/Adventure" ) )) %>% mutate(n=case_when(is.na(n)~as.integer(0), T~n)) %>% group_by(year) %>% mutate(n=n/sum(n, na.rm = T)) %>% group_by(txtKind) %>% mutate(o=max(n, na.rm=T)) %>% arrange(o)
ggplot(fgr2, aes(y=n, x=year, fill=reorder(txtKind, o), color=reorder(txtKind, o)))+
geom_area(stat="identity", color="white", size=0.1)+
annotate("rect", ymax=1, xmax=1979.8, ymin=0, xmin=1975, fill="white", color="white", alpha=0.9)+
geom_text(aes(y=nc, label=txtKind), data=fgr2 %>% ungroup() %>% filter(year==2020) %>% mutate(nc=1-(cumsum(n))), hjust=-0.1, size=3, vjust=-0.2)+
geom_text(aes(y=nc, label=txtKind), data=fgr2 %>% ungroup() %>% filter(year==1980) %>% mutate(nc=1-(cumsum(n))), hjust=1.2, size=3, vjust=-0.2)+
scale_x_continuous(breaks=seq(1980,2020,1))+
#scale_y_reverse(lim=c(length(g), 0.5), breaks=c(1,10,20,30,40))+
labs(y="", x="Same thing but without the top 5 big stable categories")+
scale_fill_manual(values=cl)+
scale_color_manual(values=cl)+
coord_cartesian(xlim=c(1978, 2022), ylim=c(0,1))+
theme_bw()+theme(legend.position = "none", axis.text.x = element_text(size=8, angle=90, hjust=0, vjust=0.5))